Import COVID Data

url_in <- "https://github.com/CSSEGISandData/COVID-19/raw/refs/heads/master/csse_covid_19_data/csse_covid_19_time_series/"

filenames <- 
  c("time_series_19-covid-Confirmed.csv",
    "time_series_covid19_confirmed_US.csv",
    "time_series_covid19_confirmed_global.csv",
    "time_series_covid19_deaths_US.csv",
    "time_series_covid19_deaths_global.csv")
urls <- str_c(url_in, filenames)

global_cases <- read_csv(urls[3])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global_deaths <- read_csv(urls[5])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
us_cases <- read_csv(urls[2])
## Rows: 3342 Columns: 1154
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1148): UID, code3, FIPS, Lat, Long_, 1/22/20, 1/23/20, 1/24/20, 1/25/20...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
us_deaths <- read_csv(urls[4])
## Rows: 3342 Columns: 1155
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1149): UID, code3, FIPS, Lat, Long_, Population, 1/22/20, 1/23/20, 1/24...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Create tidy data sets

global_cases_wrangle <- global_cases %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date",
               values_to = "cases") %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long))
global_deaths_wrangle <- global_deaths %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date",
               values_to = "deaths") %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long))
#  filter(deaths != 0)
us_cases_wrangle <- us_cases %>%
  select(-c(UID, iso2, iso3,code3, FIPS, Lat, Long_)) %>%
  rename(City = Admin2, State = Province_State, Country = Country_Region) %>%
  pivot_longer(cols = -c(City, State, Country, Combined_Key),
               names_to = "date",
               values_to = "cases") %>%
  mutate(date = mdy(date))
us_deaths_wrangle <- us_deaths %>%
  select(-c(UID, iso2, iso3,code3, FIPS, Lat, Long_)) %>%
  rename(City = Admin2, State = Province_State, Country = Country_Region) %>%
  pivot_longer(cols = -c(City, State, Country, Combined_Key, Population),
               names_to = "date",
               values_to = "deaths") %>%
  mutate(date = mdy(date))
#  filter(deaths != 0)
global <- global_cases_wrangle %>%
  full_join(global_deaths_wrangle)
## Joining with `by = join_by(`Province/State`, `Country/Region`, date)`
us <- us_cases_wrangle %>%
  full_join(us_deaths_wrangle, by = c("City", "State", "Country", "Combined_Key", "date"))
  1. Insights from the US Data Visualization of Observed Cases and Deaths

The visualization of observed daily cases and deaths in the United States reveals critical trends in the progression of the COVID-19 pandemic. Peaks correspond to significant waves of infections, driven by new variants, holiday gatherings, or waning immunity. The lag between case surges and death spikes highlights the progression of the disease and its impact on healthcare resources.

The ARIMA model provided a reliable forecast of daily new deaths in the U.S., capturing seasonality and trend changes. Such forecasts can assist first responders by:

  1. Anticipating resource needs (e.g., ICU beds and staff allocation).
  2. Guiding policy interventions during surges.
  3. Supporting proactive measures like emergency planning and supply chain management.
# Load the dataset
data <- us

# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")

# Ensure daily new cases and new deaths calculation if they are cumulative
data <- data %>%
  group_by(Country, State) %>%
  arrange(date) %>%
  mutate(
    new_cases = cases - lag(cases, default = 0),
    new_deaths = deaths - lag(deaths, default = 0)
  ) %>%
  ungroup()

# Filter the dataset for the US starting from 2020 and calculate daily new deaths and new cases
us_data <- data %>%
  filter(Country == "US", date >= as.Date("2020-01-01")) %>%
  group_by(date) %>%
  summarize(
    Total_New_Deaths = sum(new_deaths, na.rm = TRUE),
    Total_New_Cases = sum(new_cases, na.rm = TRUE),
    .groups = 'drop'
  )

# Prepare data for faceting
us_data_long <- us_data %>%
  pivot_longer(
    cols = c(Total_New_Cases, Total_New_Deaths),
    names_to = "Metric",
    values_to = "Value"
  )

# Visualize the observed data with facets
library(ggplot2)

ggplot(us_data_long, aes(x = date, y = Value, color = Metric)) +
  geom_line() +
  facet_wrap(~Metric, ncol = 1, scales = "free_y") +
  ggtitle("Observed Daily New Cases and Deaths in the US") +
  xlab("Date") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")

# Fit the ARIMA model for new deaths
library(forecast)
arima_model_deaths <- auto.arima(us_data$Total_New_Deaths, seasonal = TRUE, stepwise = TRUE, approximation = FALSE)

# Print the model summary
summary(arima_model_deaths)
## Series: us_data$Total_New_Deaths 
## ARIMA(4,1,4) 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ma1     ma2     ma3      ma4
##       0.3354  -0.0274  -0.6808  -0.1615  -1.1922  0.1871  0.8263  -0.6656
## s.e.  0.0516   0.0581   0.0535   0.0371   0.0451  0.0820  0.0793   0.0426
## 
## sigma^2 = 177.1:  log likelihood = -4574.06
## AIC=9166.11   AICc=9166.27   BIC=9211.48
## 
## Training set error measures:
##                      ME     RMSE     MAE MPE MAPE      MASE        ACF1
## Training set 0.05854253 13.25594 6.22893 NaN  Inf 0.7061886 0.006993717
# Forecast the next 30 days for new deaths
forecast_deaths <- forecast(arima_model_deaths, h = 30)

# Create a dataframe for the forecast with actual dates
forecast_deaths_df <- data.frame(
  Date = seq(max(us_data$date) + 1, by = "day", length.out = 30),
  Forecast = forecast_deaths$mean,
  Lower_80 = forecast_deaths$lower[, 1],
  Upper_80 = forecast_deaths$upper[, 1],
  Lower_95 = forecast_deaths$lower[, 2],
  Upper_95 = forecast_deaths$upper[, 2]
)

# Plot the forecast with actual dates
library(ggplot2)
ggplot() +
  geom_line(data = us_data, aes(x = date, y = Total_New_Deaths, color = "Observed")) +
  geom_line(data = forecast_deaths_df, aes(x = Date, y = Forecast, color = "Forecast")) +
  geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), fill = "blue", alpha = 0.2) +
  geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_80, ymax = Upper_80), fill = "lightblue", alpha = 0.4) +
  ggtitle("ARIMA Forecast for Daily New Deaths in the US") +
  xlab("Date") +
  ylab("New Deaths") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(legend.title = element_blank())

  1. Observations from the Cumulative Cases and Deaths Heatmaps in the U.S. The cumulative heatmaps highlighted geographic disparities in the pandemic’s impact. Regions with high cumulative cases and deaths relative to population percentages may indicate vulnerabilities such as healthcare infrastructure limitations, demographic factors, or varying levels of compliance with public health guidelines.
# Load the dataset
data <- us

# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")

# Ensure cumulative cases and deaths are converted to daily values
data <- data %>%
  group_by(Country, State) %>%
  arrange(date) %>%
  mutate(
    daily_cases = cases - lag(cases, default = 0),
    daily_deaths = deaths - lag(deaths, default = 0)
  ) %>%
  ungroup()

# Filter the dataset for the US starting from 2020 and calculate cumulative cases and deaths
state_cumulative_data <- data %>%
  filter(Country == "US", date >= as.Date("2020-01-01")) %>%
  group_by(State, date) %>%
  summarize(
    Cumulative_Cases = sum(cases, na.rm = TRUE),
    Cumulative_Deaths = sum(deaths, na.rm = TRUE),
    Population = signif(max(Population, na.rm = TRUE), 6),
    .groups = 'drop'
  ) %>%
  mutate(
    Cases_Percentage = round((Cumulative_Cases / Population) * 100, 2),
    Deaths_Percentage = round((Cumulative_Deaths / Population) * 100, 2)
  )

# Get US states map data
us_states <- map_data("state")

# Get outer boundary of the United States
us_outer_boundary <- us_states %>%
  group_by(group) %>%
  filter(n() > 1)

# Merge COVID data with US states map data
state_cumulative_data <- state_cumulative_data %>%
  mutate(State = tolower(State))  # Ensure state names are lowercase to match map data

merged_data <- left_join(us_states, state_cumulative_data, by = c("region" = "State"))
## Warning in left_join(us_states, state_cumulative_data, by = c(region = "State")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Create the base heatmap visualization
us_map <- ggplot(merged_data, aes(x = long, y = lat, group = group, fill = Cases_Percentage)) +
  geom_polygon(color = "white") +  # Remove internal state borders
  geom_path(data = us_outer_boundary, aes(x = long, y = lat, group = group), color = "white", size = 0.7, inherit.aes = FALSE) +  # Add outer US outline
  scale_fill_gradientn(
    name = "% of Population",
    colors = c("white", "yellow", "lightblue", "lightgreen", "orange", "red"),
    values = scales::rescale(c(0, 0.01, 0.05, 0.1, 0.5, 1)),
    na.value = "grey50"  # Handle missing values
  ) +
  labs(
    title = "Cumulative Cases as % of Population by State: {frame_time}",
    subtitle = "Progression Over Time",
    caption = "Source: US Dataset"
  ) +
  coord_fixed(1.3) +  # Fix aspect ratio
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    legend.position = "right"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Add animation to the heatmap over time
animated_map <- us_map +
  transition_time(date) +  # Animate by day
  labs(title = "Cumulative Cases as % of Population by State: {frame_time}")

# Render the animation with an end pause
animate(animated_map, nframes = 100, fps = 5, end_pause = 20, renderer = gifski_renderer())

# Save the animation as a GIF
anim_save("covid_US_newcases_heatmap_percentage_animation.gif", animation = last_animation())
# Load the dataset
data <- us

# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")

# Ensure cumulative cases and deaths are converted to daily values
data <- data %>%
  group_by(Country, State) %>%
  arrange(date) %>%
  mutate(
    daily_cases = cases - lag(cases, default = 0),
    daily_deaths = deaths - lag(deaths, default = 0)
  ) %>%
  ungroup()

# Filter the dataset for the US starting from 2020 and calculate cumulative cases and deaths
state_cumulative_data <- data %>%
  filter(Country == "US", date >= as.Date("2020-01-01")) %>%
  group_by(State, date) %>%
  summarize(
    Cumulative_Cases = sum(cases, na.rm = TRUE),
    Cumulative_Deaths = sum(deaths, na.rm = TRUE),
    Population = signif(max(Population, na.rm = TRUE), 6),
    .groups = 'drop'
  ) %>%
  mutate(
    Cases_Percentage = round((Cumulative_Cases / Population) * 100, 2),
    Deaths_Percentage = round((Cumulative_Deaths / Population) * 100, 2)
  )

# Get US states map data
us_states <- map_data("state")

# Get outer boundary of the United States
us_outer_boundary <- us_states %>%
  group_by(group) %>%
  filter(n() > 1)

# Merge COVID data with US states map data
state_cumulative_data <- state_cumulative_data %>%
  mutate(State = tolower(State))  # Ensure state names are lowercase to match map data

merged_data <- left_join(us_states, state_cumulative_data, by = c("region" = "State"))
## Warning in left_join(us_states, state_cumulative_data, by = c(region = "State")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Create the base heatmap visualization
us_map <- ggplot(merged_data, aes(x = long, y = lat, group = group, fill = Deaths_Percentage)) +
  geom_polygon(color = "white") +  # Remove internal state borders
  geom_path(data = us_outer_boundary, aes(x = long, y = lat, group = group), color = "white", size = 0.7, inherit.aes = FALSE) +  # Add outer US outline
  scale_fill_gradientn(
    name = "% of Population",
    colors = c("white", "yellow", "lightblue", "lightgreen", "orange", "red"),
    values = scales::rescale(c(0, 0.01, 0.05, 0.1, 0.5, 1)),
    na.value = "grey50"  # Handle missing values
  ) +
  labs(
    title = "Cumulative Deaths as % of Population by State: {frame_time}",
    subtitle = "Progression Over Time",
    caption = "Source: US Dataset"
  ) +
  coord_fixed(1.3) +  # Fix aspect ratio
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    legend.position = "right"
  )

# Add animation to the heatmap over time
animated_map <- us_map +
  transition_time(date) +  # Animate by day
  labs(title = "Cumulative Deaths as % of Population by State: {frame_time}")

# Render the animation with an end pause
animate(animated_map, nframes = 100, fps = 5, end_pause = 20, renderer = gifski_renderer())

# Save the animation as a GIF
anim_save("covid_US_heatmap_cumulative_deaths_percentage_animation.gif", animation = last_animation())
  1. Insights from the Global Data Visualization of Observed Cases and Deaths The global visualization underscored uneven pandemic impacts across countries. Peaks reflect significant global waves, often correlating with the spread of variants like Delta and Omicron. It highlights regions with delayed case reporting and death surges, emphasizing the need for synchronized global efforts.

The ARIMA forecast for global data demonstrated its adaptability to larger datasets. Potential applications include:

  1. Early warnings for emerging hotspots.
  2. Optimized distribution of global medical resources.
  3. Informing travel advisories and cross-border health policies.
# Load the dataset
data <- global

# Convert date to Date format
data$date <- as.Date(data$date)

# Ensure daily new cases and new deaths calculation if they are cumulative
data <- data %>%
  arrange(date) %>%
  group_by(`Country/Region`, `Province/State`) %>%
  mutate(
    new_cases = cases - lag(cases, default = 0),
    new_deaths = deaths - lag(deaths, default = 0)
  ) %>%
  ungroup()

# Check for unexpected values in new cases and new deaths
data <- data %>%
  mutate(
    new_cases = ifelse(new_cases < 0, NA, new_cases),  # Remove negative values
    new_deaths = ifelse(new_deaths < 0, NA, new_deaths)  # Remove negative values
  )

# Filter the dataset globally starting from 2020 and calculate daily new deaths and new cases
global_data <- data %>%
  filter(date >= as.Date("2020-01-01")) %>%
  group_by(date) %>%
  summarize(
    Total_New_Deaths = sum(new_deaths, na.rm = TRUE),
    Total_New_Cases = sum(new_cases, na.rm = TRUE),
    .groups = 'drop'
  )

# Validate the aggregation results
print(summary(global_data$Total_New_Deaths))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    2180    5849    6028    8773   60903
print(summary(global_data$Total_New_Cases))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100  265196  476495  592793  677628 4083281
# Prepare data for faceting
global_data_long <- global_data %>%
  pivot_longer(
    cols = c(Total_New_Cases, Total_New_Deaths),
    names_to = "Metric",
    values_to = "Value"
  )

# Visualize the observed data with facets
library(ggplot2)

ggplot(global_data_long, aes(x = date, y = Value, color = Metric)) +
  geom_line() +
  facet_wrap(~Metric, ncol = 1, scales = "free_y") +
  ggtitle("Observed Daily New Cases and Deaths Globally") +
  xlab("Date") +
  ylab("Count") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(legend.position = "none")

# Fit the ARIMA model for new deaths
library(forecast)
arima_model_deaths <- auto.arima(global_data$Total_New_Deaths, seasonal = TRUE, stepwise = TRUE, approximation = FALSE)

# Print the model summary
summary(arima_model_deaths)
## Series: global_data$Total_New_Deaths 
## ARIMA(5,1,2) 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ma1     ma2
##       0.2076  -0.4615  -0.3542  -0.2962  -0.2818  -1.0503  0.5850
## s.e.  0.0738   0.0367   0.0306   0.0298   0.0345   0.0672  0.0872
## 
## sigma^2 = 4916807:  log likelihood = -10416.19
## AIC=20848.38   AICc=20848.51   BIC=20888.7
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 2.871213 2209.614 807.6754 -16.43327 30.38921 0.6818144
##                      ACF1
## Training set -0.005524948
# Forecast the next 30 days for new deaths
forecast_deaths <- forecast(arima_model_deaths, h = 30)

# Create a dataframe for the forecast with actual dates
forecast_deaths_df <- data.frame(
  Date = seq(max(global_data$date) + 1, by = "day", length.out = 30),
  Forecast = forecast_deaths$mean,
  Lower_80 = forecast_deaths$lower[, 1],
  Upper_80 = forecast_deaths$upper[, 1],
  Lower_95 = forecast_deaths$lower[, 2],
  Upper_95 = forecast_deaths$upper[, 2]
)

# Plot the forecast with actual dates
ggplot() +
  geom_line(data = global_data, aes(x = date, y = Total_New_Deaths, color = "Observed")) +
  geom_line(data = forecast_deaths_df, aes(x = Date, y = Forecast, color = "Forecast")) +
  geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), fill = "blue", alpha = 0.2) +
  geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_80, ymax = Upper_80), fill = "lightblue", alpha = 0.4) +
  ggtitle("ARIMA Forecast for Daily New Deaths Globally") +
  xlab("Date") +
  ylab("New Deaths") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(legend.title = element_blank())

  1. Potential Areas of Bias
  1. Data Reporting: Inconsistent or incomplete reporting by countries or states introduces bias.
  2. Population Differences: Data normalized by population does not account for demographic factors like age or underlying health conditions.
  3. Forecast Assumptions: ARIMA relies on historical patterns, which may not predict unprecedented events (e.g., new variants or sudden intervention measures).
  1. Potential Areas of Concern
  1. Underreporting: Many regions might underreport due to limited resources or political considerations.
  2. Model Limitations: ARIMA models do not incorporate external variables like vaccine effectiveness or behavioral changes.
  3. Ethical Use of Data: Ensuring equity in resource allocation and decision-making based on predictive data is crucial.
  1. Key Questions and Potential Areas to Explore
  1. How can real-time data reporting be improved globally?
  2. What additional factors (e.g., vaccination rates, mobility data) should be incorporated into forecasting models?
  3. How do interventions (e.g., mask mandates, lockdowns) alter the validity of ARIMA models?
  4. What lessons can be learned to enhance preparedness for future pandemics?
  1. Summary The analysis integrates data visualization and ARIMA modeling to uncover trends and make informed predictions about COVID-19’s impact. While the tools provided valuable insights, addressing biases and limitations is critical for actionable outcomes. These findings emphasize the need for global coordination, improved data collection, and equity-focused strategies to mitigate health crises effectively.